## #########################################################
## REPLICATION DATA FOR:                                  ##
## Chou, Winston, Kosuke Imai, and Bryn Rosenfeld. 2017.  ##
## Sensitive Survey Questions with Auxiliary Information  ##
## Forthcoming in Sociological Methods & Research         ##
## #########################################################

############
# PREAMBLE #
############

rm(list = ls())

#setwd("YOUR-WORKING-DIRECTORY")

library("magic")
library("arm")
library("MASS")
library("xtable")
library("list")
library("rr")
library("endorse")

set.seed(4444)

## LOAD DATA
load("list-data.RData")
load("rr-data.RData")
load("end-data.RData")

## AUXILIARY INFORMATION
load("ms-g11-county-vote-data.RData")

vote <- vote[vote$insample == 1, ]
aux.state <- with(vote, sum(YES_TOT)/sum(size))
aux.counties <- vote$yes26

load("population_data.RData")

## LISTWISE DELETION
#  LIST
list <- list[is.na(list$list.y)==F,]
list <- list[is.na(list$dem)==F,]
list <- list[is.na(list$gop)==F,]
list <- list[is.na(list$maleNA)==F,]
list <- list[is.na(list$ageNA)==F,]

#  RANDOMIZED RESPONSE
rr <- rr[is.na(rr$rr.dum)==F,]
rr <- rr[is.na(rr$dem)==F,]
rr <- rr[is.na(rr$gop)==F,]
rr <- rr[is.na(rr$maleNA)==F,]
rr <- rr[is.na(rr$ageNA)==F,]
rr <- rr[is.na(rr$age55plus)==F,]

## DESIGN PARAMETERS
#  LIST
J <- 4
#  RANDOMIZED RESPONSE
p <- .5
p1 <- .5
p0 <- 0

## NUMERIC GROUP INDICATORS
list$insample <- rep(1,nrow(list))
list$numeric.county <- as.numeric(list$county)
rr$insample <- rep(1,nrow(rr))
rr$numeric.county <- as.numeric(rr$county)

#### LIST EXPERIMENT
###  NO CONSTRAINTS
list.no.aux <- ictreg(list.y ~ dem + gop + maleNA + ageNA, 
                    data = list, treat = "treat", J = J,
                    method = "nls")

###  OVERALL CONSTRAINT
h <- c("1" = aux.state)
group <- list$insample

list.aux <- ictreg(list.y ~  dem + gop + maleNA + ageNA, 
                  data = list, treat = "treat", J = J,  
                  method = "nls", h = h, group = group, 
                  maxIter = 10000)

###  COUNTY CONSTRAINTS
h <- aux.counties
names(h) <- 1:19
group <- list$numeric.county

list.aux.county <- ictreg(list.y ~  dem + gop + maleNA + ageNA, 
                         data = list, treat = "treat", J = J,  
                         method = "nls", h = h, group = group, 
                         maxIter = 10000)




#### RANDOMIZED RESPONSE EXPERIMENT
###  NO CONSTRAINTS
while(!exists("rr.no.aux")) {
  try(
    rr.no.aux <- rrreg(rr.dum ~ dem + gop + maleNA + ageNA + age55plus, 
      data = rr, p = p, p1 = p1, p0 = p0, design = "forced-known")
    )
}

###  OVERALL CONSTRAINTS
h <- c("1" = aux.state)
group <- rr$insample

rr.aux <- rrreg(rr.dum ~ dem + gop + maleNA + ageNA + age55plus, 
                    data = rr, p = p, p1 = p1, p0 = p0, design = "forced-known", 
                    h = h, group = as.character(group), start = coef(rr.no.aux))



###  COUNTY CONSTRAINTS
h <- aux.counties
names(h) <- 1:19
group <- rr$numeric.county

rr.aux.county <- rrreg(rr.dum ~ dem + gop + maleNA + ageNA + age55plus, 
                    data = rr, p = p, p1 = p1, p0 = p0, design = "forced-known", 
                    h = h, group = as.character(group), start = coef(rr.no.aux))


##### SPECIFICATION TEST
list.table <- cbind(coef(list.no.aux)[1:5], sqrt(diag(vcov(list.no.aux)))[1:5], 
  coef(list.aux.county)[1:5], sqrt(diag(vcov(list.aux.county)))[1:5])
list.table <- rbind(list.table, rep(NA, 4))
list.table <- rbind(list.table, c(NA, NA, list.aux.county$J.stat, list.aux.county$overid.p))
colnames(list.table) <- c("Est.", "S.E.", "Est.", "S.E.")
rownames(list.table) <- c("(Intercept)", "Democrat", "Republican", 
  "Male", "Age", "Aged 55 and over", "Hansen test")

rr.table <- cbind(coef(rr.no.aux), sqrt(diag(vcov(rr.no.aux))), 
  coef(rr.aux.county), sqrt(diag(vcov(rr.aux.county))))
rr.table <- rbind(rr.table, c(NA, NA, rr.aux.county$J.stat, 
  rr.aux.county$overid.p))
colnames(rr.table) <- c("Est.", "S.E.", "Est.", "S.E.")
rownames(rr.table) <- c("(Intercept)", "Democrat", "Republican", 
  "Male", "Age", "Aged 55 and over", "Hansen test")

## ###########
## TABLE 2 
## SPECIFICATION TEST
## ###########
table2 <- cbind(list.table, rr.table)
xtable(table2)




#### LIST PREDICTIONS VIA POSTSTRATIFICATION
## Get SEs by quai-Bayesian simulation
# setting up 
x.pop <- pop.full
x.pop <- subset(x.pop, select=c(county, dem, gop, maleNA, ageNA))
x.pop <- data.frame(rep(1,nrow(x.pop)), x.pop)

n <- 1:19

sims <- 1000

# Beta matrix is nsims x 5
noaux.par.treat <- mvrnorm(n=sims, 
                           mu=coef(list.no.aux)[1:(length(coef(list.no.aux))/2)], 
                           Sigma=vcov(list.no.aux)[1:(length(coef(list.no.aux))/2),1:(length(coef(list.no.aux))/2)])
aux.par.treat <- mvrnorm(n=sims, 
                           mu=coef(list.aux)[1:(length(coef(list.aux))/2)], 
                           Sigma=vcov(list.aux)[1:(length(coef(list.aux))/2),1:(length(coef(list.aux))/2)])
aux.par.treat.county <- mvrnorm(n=sims, 
                                mu=coef(list.aux.county)[1:(length(coef(list.aux.county))/2)], 
                                Sigma=vcov(list.aux.county)[1:(length(coef(list.aux.county))/2),1:(length(coef(list.aux.county))/2)])  

county.pred.noaux <- array(NA, c(3,length(n)))
county.pred.aux <- array(NA, c(3,length(n)))
county.pred.aux.county <- array(NA, c(3,length(n)))

for (i in n) {
  
    # Create individual-level matrix of covariates for each county (X is n.obs x 5)
    x <- x.pop[as.numeric(x.pop$county)==i,]
    x <- x[,-2]  
    county.pred.noaux[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(noaux.par.treat))), probs=c(.5, .025, .975))
    county.pred.aux[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat))), probs=c(.5, .025, .975))
    county.pred.aux.county[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat.county))), probs=c(.5, .025, .975))
    
}


####
## Predict precinct-level vote 
## Post-stratified estimates
####

# Load precinct-level vote data
load("ms-g11-precinct-vote-data.RData")

# Create county_precinct variable with no missing values
vote <- vote[is.na(vote$precinct)==F,]
vote$county_precinct <- with(vote, factor(paste(COUNTY, precinct, sep="/")))
vote$county_precinct_numeric <- as.numeric(vote$county_precinct)

# Create county-precinct variable in population data
pop.full$county_precinct <- with(pop.full, factor(paste(county, precinct, sep="/")))

# Merging numeric county-precinct variable from voting data and restricting population data to sampled precincts
pop <- merge(subset(vote, select=c(county_precinct, county_precinct_numeric)), pop.full, by="county_precinct")

## Get SEs using quasi-Bayesian simulation
# setting up
x.pop <- pop
x.pop <- subset(x.pop, select=c(county_precinct_numeric, dem, gop, maleNA, ageNA))
x.pop <- data.frame(rep(1,nrow(x.pop)), x.pop)

n <- 1:length(vote$county_precinct_numeric)

prec.pred.noaux <- array(NA, c(3,length(n)))
prec.pred.aux <- array(NA, c(3,length(n)))
prec.pred.aux.county <- array(NA, c(3,length(n)))

se.noaux <- array(NA, c(1,length(n)))
se.aux <- array(NA, c(1,length(n)))
se.aux.county <- array(NA, c(1,length(n)))

counter <- 0

for (i in n) {
  
  # Create individual-level matrix of covariates for each county
  x <- x.pop[x.pop$county_precinct_numeric==i,]
  x <- x[,-2]  
  prec.pred.noaux[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(noaux.par.treat)), na.rm=T), probs=c(.5, .025, .975))
  se.noaux[,i] <- sd(colMeans(invlogit(as.matrix(x) %*% t(noaux.par.treat)), na.rm=T))
  prec.pred.aux[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat)), na.rm=T), probs=c(.5, .025, .975))
  se.aux[,i] <- sd(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat)), na.rm=T))
  prec.pred.aux.county[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat.county)), na.rm=T), probs=c(.5, .025, .975))
  se.aux.county[,i] <- sd(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat.county)), na.rm=T))
  
  counter <- counter + 1
  if (counter == 1) cat("Iteration: \n", counter) else cat(", ", counter)
}

write.table(prec.pred.noaux, file="prec_pred_noaux_bayes.txt")
write.table(prec.pred.aux, file="prec_pred_aux_bayes.txt")
write.table(prec.pred.aux.county, file="prec_pred_aux_county_bayes.txt")

write.table(se.noaux, file="prec_pred_noaux_se_list.txt") 
write.table(se.aux, file="prec_pred_aux_se_list.txt") 
write.table(se.aux.county, file="prec_pred_aux_county_se_list.txt") 

#####
## PLOT RESULTS
#####

results.list <- list(county.pred.noaux,
                county.pred.aux,
                county.pred.aux.county,
                prec.pred.noaux,
                prec.pred.aux,
                prec.pred.aux.county)
approach <- c("county.noaux",
              "county.aux.state",
              "county.aux.county",
              "prec.noaux",
              "prec.aux.state",
              "prec.aux.county")

#Calculate rmse
rmse <- NULL
for (i in 1:3) {
  print(i)
  rmse[i] <- sqrt(mean((results.list[[i]][1,] - aux.counties)^2))
}
for (i in 4:6) {
  print(i)
  rmse[i] <- sqrt(mean((results.list[[i]][1,] - vote[order(vote$county_precinct_numeric), ]$yes26)^2))
}


#Calculate bias
bias <- NULL
for (i in 1:3) {
  bias[i] <- (1-mean(results.list[[i]][1,])) - (1-mean(aux.counties)) 
}
for (i in 4:6) {
  bias[i] <- (1-mean(results.list[[i]][1,])) - (1-mean(vote$yes26))
}

#Calculate correlation
cor <- NULL
for (i in 1:3) {
  cor[i] <- cor(aux.counties, results.list[[i]][1,])
  print(cor.test(aux.counties, results.list[[i]][1,]))
}
for (i in 4:6) {
  cor[i] <- cor(vote[order(vote$county_precinct_numeric), ]$yes26, results.list[[i]][1,])
  print(cor.test(vote[order(vote$county_precinct_numeric), ]$yes26, results.list[[i]][1,]))
}

tab <- data.frame(approach, rmse, bias, cor)
print(tab)

tab.list <- tab

## #########
## FIGURE 1
## #########

#Open plot
pdf("figure-1.pdf", width = 7, height = 9)
{
      par(mfrow = c(3, 2),
          las = 1, 
          mar = c(3, 2.5, 0.1, 0.1), 
          oma = c(1.5, 2.5, 2, 1.5),
          mgp = c(1.8, 0.5, 0),
          xpd = FALSE,
          tck = -0.02, 
          cex = 0.8, 
          pty = "sq"
      )

      truth <- list(1-aux.counties, 1-aux.counties, 1-aux.counties, 1-vote$yes26, 1-vote$yes26, 1-vote$yes26)
      pch <- 19
      
      for (i in c(4, 6)){
        est <- 1-results.list[[i]][1,]
        low.est <- 1-results.list[[i]][2,]
        up.est <- 1-results.list[[i]][3,]
        plot(truth[[i]], est, type = "p", ylab = "", xlab = "", 
             col=rgb(0,0,0,alpha=1),
             pch = pch, 
             main="",
             xlim=c(0,1),
             ylim=c(0,1))
        mtext(text = "Actual", side = 1, line = 1.5, las = 0)
        mtext(text = "Estimate", side = 2, line = 2.25, las = 0)
        abline(0,1, col="red")
        segments(truth[[i]], low.est, truth[[i]], up.est, lwd =  0.5, col=rgb(0,0,0,alpha=1))
        text(x = -0.015, y = 0.94, paste("bias = ", round(tab.list[i,3], 3), "\nRMSE = ", round(tab.list[i,2] ,3), "\ncor = ", round(tab.list[i,4] ,3), sep=""), 
           adj=0, cex = 0.8)
        abline(lm(est~truth[[i]]))
      }
      
      mtext("Without Auxiliary Information", side = 3, line = 0.5, outer = TRUE, at=.28, font=2, las = 0) 
      mtext("With Auxiliary Information", side = 3, line = 0.5, outer = TRUE, at=.78, font=2, las = 0) 

      mtext("List Experiment", side = 2, line = 0.75, outer = TRUE, at = .86, font=2, las = 0) 
      mtext("Randomized Response", side = 2, line = 0.75, outer = TRUE, at = .525, font=2, las = 0) 
      mtext("Endorsement Experiment", side = 2, line = 0.75, outer = TRUE, at=.2, font=2, las = 0) 


}

#### RR PREDICTIONS FROM POSTSTRATIFICATION
## Get SEs by quai-Bayesian simulation
# setting up 
x.pop <- pop.full
x.pop <- subset(x.pop, select=c(county, dem, gop, maleNA, ageNA, age55plus))
x.pop <- data.frame(rep(1,nrow(x.pop)), x.pop)

n <- 1:19

sims <- 1000

# Beta matrix is nsims x 5
noaux.par.treat <- mvrnorm(n=sims, 
                           mu=coef(rr.no.aux), 
                           Sigma=vcov(rr.no.aux))
aux.par.treat <- mvrnorm(n=sims, 
                           mu=coef(rr.aux), 
                           Sigma=vcov(rr.aux))
aux.par.treat.county <- mvrnorm(n=sims, 
                                mu=coef(rr.aux.county), 
                                Sigma=vcov(rr.aux.county))  


county.pred.noaux <- array(NA, c(3,length(n)))
county.pred.aux <- array(NA, c(3,length(n)))
county.pred.aux.county <- array(NA, c(3,length(n)))

for (i in n) {
  
  # Create individual-level matrix of covariates for each county (X is n.obs x 5)
  x <- x.pop[as.numeric(x.pop$county)==i,]
  x <- x[,-2]  
  county.pred.noaux[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(noaux.par.treat))), probs=c(.5, .025, .975))
  county.pred.aux[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat))), probs=c(.5, .025, .975))
  county.pred.aux.county[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat.county))), probs=c(.5, .025, .975))
  
}

####
## Predict precinct-level vote 
## Post-stratified estimates
####

## Get SEs using quasi-Bayesian simulation
# setting up
x.pop <- subset(pop, select=c(county_precinct_numeric, dem, gop, maleNA, ageNA, age55plus))
x.pop <- data.frame(rep(1,nrow(x.pop)), x.pop)

n <- 1:length(vote$county_precinct_numeric)

prec.pred.noaux <- array(NA, c(3,length(n)))
prec.pred.aux <- array(NA, c(3,length(n)))
prec.pred.aux.county <- array(NA, c(3,length(n)))

se.noaux <- array(NA, c(1,length(n)))
se.aux <- array(NA, c(1,length(n)))
se.aux.county <- array(NA, c(1,length(n)))

counter <- 0

for (i in n) {
  
  # Create individual-level matrix of covariates for each county
  x <- x.pop[x.pop$county_precinct_numeric==i,]
  x <- x[,-2]  
  prec.pred.noaux[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(noaux.par.treat)), na.rm=T), probs=c(.5, .025, .975))
  se.noaux[,i] <- sd(colMeans(invlogit(as.matrix(x) %*% t(noaux.par.treat)), na.rm=T))
  prec.pred.aux[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat)), na.rm=T), probs=c(.5, .025, .975))
  se.aux[,i] <- sd(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat)), na.rm=T))
  prec.pred.aux.county[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat.county)), na.rm=T), probs=c(.5, .025, .975))
  se.aux.county[,i] <- sd(colMeans(invlogit(as.matrix(x) %*% t(aux.par.treat.county)), na.rm=T))
  
  counter <- counter + 1
  if (counter == 1) cat("Iteration: \n", counter) else cat(", ", counter)
}

write.table(prec.pred.noaux, file="prec_pred_noaux_bayes_rr.txt")
write.table(prec.pred.aux, file="prec_pred_aux_bayes_rr.txt")
write.table(prec.pred.aux.county, file="prec_pred_aux_county_bayes_rr.txt")

write.table(se.noaux, file="prec_pred_noaux_se_rr.txt") 
write.table(se.aux, file="prec_pred_aux_se_rr.txt") 
write.table(se.aux.county, file="prec_pred_aux_county_se_rr.txt")

#####
## PLOT RESULTS
#####

results.rr <- list(county.pred.noaux,
                county.pred.aux,
                county.pred.aux.county,
                prec.pred.noaux,
                prec.pred.aux,
                prec.pred.aux.county)
approach <- c("county.noaux",
              "county.aux.state",
              "county.aux.county",
              "prec.noaux",
              "prec.aux.state",
              "prec.aux.county")

#Calculate rmse
rmse <- NULL
for (i in 1:3) {
  print(i)
  rmse[i] <- sqrt(mean((results.rr[[i]][1,] - aux.counties)^2))
}
for (i in 4:6) {
  print(i)
  rmse[i] <- sqrt(mean((results.rr[[i]][1,] - vote[order(vote$county_precinct_numeric), ]$yes26)^2))
}


#Calculate bias
bias <- NULL
for (i in 1:3) {
  bias[i] <- (1-mean(results.rr[[i]][1,])) - (1-mean(aux.counties)) 
}
for (i in 4:6) {
  bias[i] <- (1-mean(results.rr[[i]][1,])) - (1-mean(vote$yes26))
}

#Calculate correlation
cor <- NULL
for (i in 1:3) {
  cor[i] <- cor(aux.counties, results.rr[[i]][1,])
  print(cor.test(aux.counties, results.rr[[i]][1,]))
}
for (i in 4:6) {
  cor[i] <- cor(vote[order(vote$county_precinct_numeric), ]$yes26, results.rr[[i]][1,])
  print(cor.test(vote[order(vote$county_precinct_numeric), ]$yes26, results.rr[[i]][1,]))
}

tab <- data.frame(approach, rmse, bias, cor)
print(tab)

tab.rr <- tab

# Plot
  {

    truth <- list(1-aux.counties, 1-aux.counties, 1-aux.counties, 1-vote$yes26, 1-vote$yes26, 1-vote$yes26)
    pch <- 19
    
    for (i in c(4, 6)){
      est <- 1-results.rr[[i]][1,]
      low.est <- 1-results.rr[[i]][2,]
      up.est <- 1-results.rr[[i]][3,]
      plot(truth[[i]], est, type = "p", ylab = "", xlab = "", 
           col=rgb(0,0,0,alpha=1),
           pch = pch, 
           main="",
           xlim=c(0,1),
           ylim=c(0,1))
      mtext(text = "Actual", side = 1, line = 1.5, las = 0)
      mtext(text = "Estimate", side = 2, line = 2.25, las = 0)
      abline(0,1, col="red")
      segments(truth[[i]], low.est, truth[[i]], up.est, lwd =  0.5, col=rgb(0,0,0,alpha=1))
      text(x = -0.015, y = 0.94, paste("bias = ", round(tab.rr[i,3], 3), "\nRMSE = ", round(tab.rr[i,2] ,3), "\ncor = ", round(tab.rr[i,4] ,3), sep=""), 
         adj=0, cex = 0.8)
      abline(lm(est~truth[[i]]))
    }
    
  }
## #########
## ENDORSMENT EXPERIMENT 
## #########

load("ms-g11-county-vote-data.RData")

# Select sampled counties from voting data
vote <- vote[vote$insample == 1, ]

# define auxiliary moments (in terms of ’no' vote)
# vote return for each of 19 counties (vector of county returns in alphabetical order)
aux.counties <- 1-vote$yes26

Y <- list(Q1 = c("end.bryant.c", "end.bryant.t"))

MCMC <- 50000
burn <- 25000
thin <- 10
omega2.start <- runif(4)
start.value <- rnorm(4, sd = .6)
chains <- 1:4
vil.formula <- formula( ~ -1 + factor(county))
data.village <- subset(end, select = c("county", "county_precinct"))
data.village <- unique(data.village)

#### ESTIMATION

## Column 1: No auxiliary information
endorse.out1 <- vector('list', length(chains))
for (i in chains) {
  set.seed(i)
  endorse.out1[[i]] <- endorse(Y = Y,
                              data = end,
                              seed.store=TRUE,
                              MCMC = MCMC,
                              burn = burn,
                              thin = thin,
                              prop = .03,
                              identical.lambda = TRUE,
                              hierarchical = TRUE,
                              formula.village = vil.formula,
                              data.village = data.village,
                              village = "county_precinct",
                              omega2.start = omega2.start[i],
                              beta.start = matrix(start.value[i], nrow = 1, ncol = 2))
  holder <- endorse.out1[[i]]
}
#  check for convergence
chain1 <- mcmc(cbind(endorse.out1[[1]]$lambda, 
                     endorse.out1[[1]]$omega2), start = burn + 1, thin = thin)
chain2 <- mcmc(cbind(endorse.out1[[2]]$lambda, 
                     endorse.out1[[2]]$omega2), start = burn + 1, thin = thin)
chain3 <- mcmc(cbind(endorse.out1[[3]]$lambda, 
                     endorse.out1[[3]]$omega2), start = burn + 1, thin = thin)
chain4 <- mcmc(cbind(endorse.out1[[4]]$lambda, 
                     endorse.out1[[4]]$omega2), start = burn + 1, thin = thin)

check.R.hat <- mcmc.list(chain1, chain2, chain3, chain4)

R.hat <- gelman.diag(check.R.hat)

R.hat

endorse.out.new <- endorse.out1

# # #### STORE DATA

no.aux.lambda <- rbind(endorse.out.new[[1]]$lambda, 
                       endorse.out.new[[2]]$lambda, 
                       endorse.out.new[[3]]$lambda,
                       endorse.out.new[[4]]$lambda)

no.aux.omega2 <- rbind(endorse.out.new[[1]]$omega2, 
                       endorse.out.new[[2]]$omega2, 
                       endorse.out.new[[3]]$omega2,
                       endorse.out.new[[4]]$omega2)


write.csv(no.aux.lambda, "no-aux-lambda-new.csv", row.names = FALSE)
write.csv(no.aux.omega2, "no-aux-omega2-new.csv", row.names = FALSE)

## Column 2: County results
endorse.out2 <- vector('list', length(chains))
for (i in chains) {
  set.seed(i)
  endorse.out2[[i]] <- endorse(Y = Y,
                              data = end,
                              seed.store=TRUE,
                              MCMC = MCMC,
                              burn = burn,
                              thin = thin,
                              prop = .03,
                              identical.lambda = TRUE,
                              hierarchical = TRUE,
                              formula.village = vil.formula,
                              h = aux.counties,
                              group = "county",
                              data.village = data.village,
                              village = "county_precinct",
                              omega2.start = omega2.start[i],
                              beta.start = matrix(start.value[i], nrow = 1, ncol = 2))
  holder <- endorse.out2[[i]]
}
#  check for convergence
chain1 <- mcmc(cbind(endorse.out2[[1]]$lambda, 
                     endorse.out2[[1]]$omega2), start = burn + 1, thin = thin)
chain2 <- mcmc(cbind(endorse.out2[[2]]$lambda, 
                     endorse.out2[[2]]$omega2), start = burn + 1, thin = thin)
chain3 <- mcmc(cbind(endorse.out2[[3]]$lambda, 
                     endorse.out2[[3]]$omega2), start = burn + 1, thin = thin)
chain4 <- mcmc(cbind(endorse.out2[[4]]$lambda, 
                     endorse.out2[[4]]$omega2), start = burn + 1, thin = thin)

check.R.hat <- mcmc.list(chain1, chain2, chain3, chain4)

R.hat <- gelman.diag(check.R.hat)

R.hat

endorse.out.new <- endorse.out2

# # #### STORE DATA

county.lambda <- rbind(endorse.out.new[[1]]$lambda, 
                       endorse.out.new[[2]]$lambda, 
                       endorse.out.new[[3]]$lambda,
                       endorse.out.new[[4]]$lambda)

county.omega2 <- rbind(endorse.out.new[[1]]$omega2, 
                       endorse.out.new[[2]]$omega2, 
                       endorse.out.new[[3]]$omega2,
                       endorse.out.new[[4]]$omega2)


write.csv(county.lambda, "county-lambda-new.csv", row.names = FALSE)
write.csv(county.omega2, "county-omega2-new.csv", row.names = FALSE)

#### LOAD STORED CHAINS
no.aux.lambda <- read.csv("no-aux-lambda-new.csv", header = TRUE)
no.aux.omega2 <- read.csv("no-aux-omega2-new.csv", header = TRUE)

county.lambda <- read.csv("county-lambda-new.csv", header = TRUE)
county.omega2 <- read.csv("county-omega2-new.csv", header = TRUE)

###############################
# CALCULATE SD OF PREDICTIONS #
###############################
no.aux.prec.preds <- 1 - pnorm(sweep(x = as.matrix(no.aux.lambda), MARGIN = 1, STATS = sqrt(no.aux.omega2[, 1]), FUN = "/"))
se.end.noaux <- mean(apply(no.aux.prec.preds, 2, sd))
# 0.352

prec.pred.noaux <- apply(no.aux.prec.preds, 2, mean)

county.prec.preds <- 1 - pnorm(sweep(x = as.matrix(county.lambda), MARGIN = 1, STATS = sqrt(county.omega2[, 1]), FUN = "/"))
se.end.aux <- mean(apply(county.prec.preds, 2, sd))
# 0.283

prec.pred.aux <- apply(county.prec.preds, 2, mean)

{

  #  Validate against precincts
  names(prec.pred.noaux) <- data.village$county_precinct
  prec.pred.noaux <- prec.pred.noaux[order(names(prec.pred.noaux))]
  load("ms-g11-precinct-vote-data.Rdata")
  vote <- vote[vote$county_precinct_recode %in% names(prec.pred.noaux),]
  vote <- vote[order(vote$county_precinct), ]
  actual.precinct <- 1 - vote$yes26
  est.precinct <- 1 - prec.pred.noaux[names(prec.pred.noaux) %in% vote$county_precinct]

  plot(x = actual.precinct, y = est.precinct, xlim = c(0, 1), ylim = c(0, 1), pch = 16, 
    xlab = "", ylab = "")
  abline(a = 0, b = 1, col = "red")
  mtext(text = "Actual", side = 1, line = 1.5, las = 0)
  mtext(text = "Estimate", side = 2, line = 2.25, las = 0)
  credible.ints <- apply(sweep(no.aux.lambda, 1, sqrt(no.aux.omega2[, 1]), "/"), 2, quantile, probs = c(0.025, 0.975))
  for (i in 1:length(actual.precinct)) {
    lines(x = rep(actual.precinct[i], 2), y = pnorm(credible.ints[,i]), 
          col = rgb(0, 0, 0, alpha = 0.2))
  }
  abline(lm(est.precinct~actual.precinct))
  text(x = -0.015, y = 0.94, paste("bias = ", round(mean(est.precinct - actual.precinct), 3), 
    "\nRMSE = ", round(sqrt(mean((est.precinct - actual.precinct)^2)), 3), 
      "\ncor = ", round(cor(est.precinct, actual.precinct), 3), sep=""), 
         cex = 0.8, adj = 0)

  names(prec.pred.aux) <- data.village$county_precinct
  prec.pred.aux <- prec.pred.aux[order(names(prec.pred.aux))]
  est.precinct <- 1 - prec.pred.aux[names(prec.pred.aux) %in% vote$county_precinct]

  plot(x = actual.precinct, y = est.precinct, xlim = c(0, 1), ylim = c(0, 1), pch = 16, 
       xlab = "", ylab = "")
  mtext(text = "Actual", side = 1, line = 1.5, las = 0, cex = 1)
  abline(a = 0, b = 1, col = "red")
  cor(actual.precinct, est.precinct, use = "pairwise.complete.obs")
  credible.ints <- apply(sweep(county.lambda, 1, sqrt(county.omega2[, 1]), "/"), 2, quantile, probs = c(0.025, 0.975))
  for (i in 1:length(actual.precinct)) {
    lines(x = rep(actual.precinct[i], 2), y = pnorm(credible.ints[,i]), 
          col = rgb(0, 0, 0, alpha = 0.2))
  }

  text(x = -0.015, y = 0.94, paste("bias = ", round(mean(est.precinct - actual.precinct, na.rm = TRUE), 3), 
                                   "\nRMSE = ", round(sqrt(mean((est.precinct - actual.precinct)^2, na.rm = TRUE)), 3), 
                                   "\ncor = ", round(cor(est.precinct, actual.precinct, use = "pairwise.complete.obs"), 3), sep=""), 
       cex = 0.8, adj = 0)
  abline(lm(est.precinct~actual.precinct))

}

dev.off()

## ########
## TABLE 1 - EFFICIENCY COMPARISON
## ########

## DRAW DIRECT SAMPLES OF SIZE EQUAL TO INDIRECT METHODS 

### SAMPLE
# From direct, take all 255 who recieved dir q first and draw remaining 
# from those who got question second or third, s.t. 
# total sample size is equal to each of the indirect methods

load("dir-data.RData")

#  DIRECT
dir <- dir[is.na(dir$dir)==F,]
dir <- dir[is.na(dir$dem)==F,]
dir <- dir[is.na(dir$gop)==F,]
dir <- dir[is.na(dir$maleNA)==F,]
dir <- dir[is.na(dir$ageNA)==F,]
dir <- dir[is.na(dir$age55plus)==F,]

d1 <- subset(dir, condition=="D")  
d <- subset(dir, condition!="D")

# SAMPLE SIZE EQUAL TO LIST
nrow(list)
n <- nrow(list)-nrow(d1)
d <- d[sample(1:nrow(d), n, replace=FALSE),] 
dir.list <- rbind(d1, d)
dim(dir.list)

# SAMPLE SIZE EQUAL TO RR
nrow(rr)
d <- subset(dir, condition!="D")
n <- nrow(rr)-nrow(d1)
d <- d[sample(1:nrow(d), n, replace=FALSE),] 
dir.rr <- rbind(d1, d)
dim(dir.rr)

## DIRECT QUESTIONING, USING LIST SAMPLE SIZE
dir.logit <- glm(dir ~ dem + gop + maleNA + ageNA, family=binomial(link=logit), data=dir.list)

# Beta matrix is nsims x 5
coef.draws <- mvrnorm(n=1000, 
                      mu=coef(dir.logit), 
                      Sigma=vcov(dir.logit))

## Get SEs using quasi-Bayesian simulation
# setting up
load("ms-g11-precinct-vote-data.RData")
load("population_data.RData")
# Create county_precinct variable with no missing values
vote <- vote[is.na(vote$precinct)==F,]
vote$county_precinct <- with(vote, factor(paste(COUNTY, precinct, sep="/")))
vote$county_precinct_numeric <- as.numeric(vote$county_precinct)

# Create county-precinct variable in population data
pop.full$county_precinct <- with(pop.full, factor(paste(county, precinct, sep="/")))

# Merging numeric county-precinct variable from voting data and restricting population data to sampled precincts
pop <- merge(subset(vote, select=c(county_precinct, county_precinct_numeric)), pop.full, by="county_precinct")

x.pop <- pop
x.pop <- subset(x.pop, select=c(county_precinct_numeric, dem, gop, maleNA, ageNA))
x.pop <- data.frame(rep(1,nrow(x.pop)), x.pop)

n <- 1:length(vote$county_precinct_numeric)

prec.pred.noaux <- array(NA, c(3,length(n)))
se.noaux <- array(NA, c(1,length(n)))

counter <- 0

for (i in n) {
  
  # Create individual-level matrix of covariates for each county
  x <- x.pop[x.pop$county_precinct_numeric==i,]
  x <- x[,-2]  
  prec.pred.noaux[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(coef.draws)), na.rm=T), probs=c(.5, .025, .975))
  se.noaux[,i] <- sd(colMeans(invlogit(as.matrix(x) %*% t(coef.draws)), na.rm=T))
  counter <- counter + 1
  if (counter == 1) cat("Iteration: \n", counter) else cat(", ", counter)
}

write.table(prec.pred.noaux, file="prec_pred_noaux_dir_list_comp.txt")
write.table(se.noaux, file="prec_pred_noaux_dir_list_se_comp.txt")

## DIRECT QUESTIONING, USING RR SAMPLE SIZE
dir.logit <- glm(dir ~ dem + gop + maleNA + age55plus + ageNA, family=binomial(link=logit), data=dir.rr)

# Beta matrix is nsims x 5
coef.draws <- mvrnorm(n=1000, 
                      mu=coef(dir.logit), 
                      Sigma=vcov(dir.logit))

## Get SEs using quasi-Bayesian simulation
# setting up
x.pop <- pop
x.pop <- subset(x.pop, select=c(county_precinct_numeric, dem, gop, maleNA, age55plus, ageNA))
x.pop <- data.frame(rep(1,nrow(x.pop)), x.pop)

n <- 1:length(vote$county_precinct_numeric)

prec.pred.noaux <- array(NA, c(3,length(n)))
se.noaux <- array(NA, c(1,length(n)))

counter <- 0

for (i in n) {
  
  # Create individual-level matrix of covariates for each county
  x <- x.pop[x.pop$county_precinct_numeric==i,]
  x <- x[,-2]  
  prec.pred.noaux[,i] <- quantile(colMeans(invlogit(as.matrix(x) %*% t(coef.draws)), na.rm=T), probs=c(.5, .025, .975))
  se.noaux[,i] <- sd(colMeans(invlogit(as.matrix(x) %*% t(coef.draws)), na.rm=T))
  counter <- counter + 1
  if (counter == 1) cat("Iteration: \n", counter) else cat(", ", counter)
}

write.table(prec.pred.noaux, file="prec_pred_noaux_dir_rr_comp.txt")
write.table(se.noaux, file="prec_pred_noaux_dir_rr_se_comp.txt")

## DIRECT QUESTIONING, USING ENDORSEMENT SAMPLE SIZE
load("dir-data.RData")

#  DIRECT
dir <- dir[is.na(dir$dir)==F,]
dir <- dir[is.na(dir$county_precinct)==F,]

d1 <- subset(dir, condition=="D")  
d <- subset(dir, condition!="D")

# SAMPLE SIZE EQUAL TO ENDORSEMENT
nrow(end)
n <- nrow(end)-nrow(d1)
d <- d[sample(1:nrow(d), n, replace=TRUE),] 
dir.end <- data.frame(rbind(d1, d))
dim(dir.end)
dir.end$county_precinct <- factor(dir.end$county_precinct)

load("population_data.RData")
pop.full$county_precinct <- with(pop.full, factor(paste(county, precinct, sep="/")))

dir.end <- subset(dir.end, county_precinct %in% pop.full$county_precinct)
dir.end$county_precinct <- as.character(dir.end$county_precinct)

pop <- subset(pop.full, county_precinct %in% dir.end$county_precinct)
pop$county_precinct <- as.character(pop$county_precinct)

# Probit model with precinct random effects
prec.fit  <- glmer(dir ~ (1|county_precinct), 
                   data = dir.end, 
                   family = binomial("probit"))
prec.coef <- ranef(prec.fit)$county_precinct[, 1]
prec.se <- se.ranef(prec.fit)$county_precinct[, 1]


X <- as.matrix(model.matrix(~county_precinct, data = pop))
b <- mvrnorm(n = 1000, mu = prec.coef, Sigma = diag(prec.se^2))
y.mat <- pnorm(X%*%t(b))
prec.preds <- apply(y.mat, 2, tapply, INDEX = pop$county_precinct, mean)
se.dir.end <- mean(apply(prec.preds, 1, sd))
se.dir.end


## Analysis
dir <- read.table("prec_pred_noaux_dir_list_se_comp.txt")
se.dir.list <- mean(as.numeric(dir))

list <- read.table("prec_pred_noaux_se_list.txt")
se.list.noaux <- mean(as.numeric(list))

list <- read.table("prec_pred_aux_county_se_list.txt")
se.list.aux <- mean(as.numeric(list))

dir <- read.table("prec_pred_noaux_dir_rr_se_comp.txt")
se.dir.rr <- mean(as.numeric(dir))

rr <- read.table("prec_pred_noaux_se_rr.txt")
se.rr.noaux <- mean(as.numeric(rr))

rr <- read.table("prec_pred_aux_county_se_rr.txt")
se.rr.aux <- mean(as.numeric(rr))

## How much efficiciency do we recoup by adding auxiliary information?
table1 <- rbind(cbind(se.dir.list, NA, se.dir.rr, NA, se.dir.end, NA),
      cbind(se.list.noaux, se.list.noaux/se.dir.list, se.rr.noaux, se.rr.noaux/se.dir.rr, se.end.noaux, se.end.noaux/se.dir.end),
      cbind(se.list.aux, se.list.aux/se.dir.list, se.rr.aux, se.rr.aux/se.dir.rr, se.end.aux, se.end.aux/se.dir.end)
)

xtable(table1, digits=3)

